In [1]:
import graphlab
%matplotlib inline
Download the dataset, which contains songs, users, and listen counts.
In [2]:
# Remote location of the song dataset (CSV file).
URL = 'https://d396qusza40orc.cloudfront.net/phoenixassets/song_data.csv'
# Use an SFrame, which is suited to larger datasets.
song_data = graphlab.SFrame(URL)
# Preview the first few rows of the loaded data.
song_data.head()
Out[2]:
In [3]:
# Total number of rows in the dataset.
len(song_data)
Out[3]:
In [4]:
# Render GraphLab Canvas visualizations inline in this notebook.
graphlab.canvas.set_target('ipynb')
# Visualize the 'song' column (histogram of the top songs).
song_data['song'].show()
In [5]:
# Collect the distinct user IDs; `users` is reused by later cells to pick
# example users for recommendations.
users = song_data['user_id'].unique()
# Number of unique users in the dataset.
len(users)
Out[5]:
In [6]:
# Create training and test data with an 80/20 random split.
# The seed is fixed (seed=0) so the split is the same on every run.
train_data, test_data = song_data.random_split(.8, seed=0)
In [7]:
# Baseline recommender trained on the training split; users are identified by
# the 'user_id' column and items by the 'song' column.
popularity_model = graphlab.popularity_recommender.create(
    train_data, user_id='user_id', item_id='song'
)
In [8]:
# The popularity model only uses song popularity to make recommendations.
# Show its recommendations for the first user in `users`.
popularity_model.recommend(users=[users[0]])
Out[8]:
In [9]:
# Recommendations from the popularity model will be the same for every user,
# as shown here for a second user. This is similar to the 'Most Emailed'
# section of the New York Times.
popularity_model.recommend(users=[users[1]])
Out[9]:
In [10]:
# Item-similarity recommender trained on the same training split and with the
# same user/item columns as the popularity model, so the two can be compared.
personalized_model = graphlab.item_similarity_recommender.create(
    train_data, user_id='user_id', item_id='song'
)
In [11]:
# Recommendations from the personalized model for the first user in `users`.
personalized_model.recommend(users=[users[0]])
Out[11]:
In [12]:
# Recommendations from the personalized model for the second user in `users`,
# for comparison with the first user's results.
personalized_model.recommend(users=[users[1]])
Out[12]:
In [13]:
# Ask the personalized model for items similar to a specific song.
# The string must match a value in the 'song' column used as item_id.
personalized_model.get_similar_items(['With Or Without You - U2'])
Out[13]:
In [14]:
# Compare both recommenders on the held-out test data. Only a sample of the
# users is evaluated (user_sample=0.05), presumably to keep evaluation fast.
# NOTE(review): the variable name is misspelled ("performace"); it is kept
# unchanged here in case other code refers to it.
model_performace = graphlab.recommender.util.compare_models(
    test_data,
    [popularity_model, personalized_model],
    user_sample=0.05
)
The average precision and recall of model M1 (the personalized model) are much higher than those of model M0 (the popularity model).
In [16]:
# Total listen count per artist, ordered from most to least listened.
(
    song_data
    .groupby(key_columns='artist',
             operations={'total_count': graphlab.aggregate.SUM('listen_count')})
    .sort('total_count', ascending=False)
)
Out[16]:
In [17]:
# For quicker analysis, look at only the first 10000 unique users in the test data.
# NOTE(review): which users land in the first 10000 depends on the order
# returned by unique() — confirm whether that order is stable across runs.
subset_test_users = test_data['user_id'].unique()[0:10000]
In [22]:
# Request only the single top recommendation (k=1) from the personalized model
# for each user in the subset.
top_rec = personalized_model.recommend(users=subset_test_users, k=1)
In [26]:
# Count how often each song appears as a top recommendation in the subset.
recommendation_count = top_rec.groupby(
    key_columns='song',
    operations={'total count': graphlab.aggregate.COUNT('song')}
)
# Display the songs ordered from most to least frequently recommended.
recommendation_count.sort('total count', ascending=False)
Out[26]:
As expected, the top recommended songs come from the most popular artists by listen count.
In [ ]: